Gender bias in audience of seminars and career position

Data

EcoEncontros Seminar talks

Talks from EcoEncontros Seminar series at the Graduate Program of Ecology in the University of SĂŁo Paulo (PPGE-USP), Brazil

See file metadata.txt, in folder data for more description and detail of the dataset.

data <- read.table("data/presentations_PPGE_2008-2019.csv", sep=",",
                   header=T, as.is=T)
data$date <- dmy(data$date)
data$year <- year(data$date) 
#skimr::skim(data)

Excluding special events as round tables and discussions not related to a project or study presented by someone.

IDs <- c(154, 250, 211, 289)
data <- data %>% filter(!id %in% IDs)

For this specific analysis, excluding speakers that are not in academia (“others”), and keeping undergraduate students, MD and PhD in the group student. postdoc, professor or researcher*.

*Researchers are included in the professor categorical position (column position_cat) because all of them come from research institutions.

data <- data %>% filter(position_cat != "others")
data$position_cat <- fct_relevel(data$position_cat, "student", 
                                 "postdoc","professor")

Excluding seminars with more than one speaker

events <- data %>% count(id) %>% filter(n>1)
data <- data %>% filter(!id %in% events$id,
                        !is.na(audience_n))
dim(data)
## [1] 299  30

Data description

Audience by gender and academic position

ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  geom_boxplot()

  #geom_violin(position = position_dodge(0.8)) +
  #geom_jitter(position=position_jitterdodge(0.2),shape=21)
library(ggbeeswarm)
# outra opção
ggplot(data, aes(x=position_cat, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_violin(col="black") +
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar', 
               width=0.8, size=0.8, position = position_dodge(width = 0.9))+
  xlab("") + ylab("Audience (N)")

Variation in time

ggplot(data, aes(x=date, y=audience_n, fill=gender)) +
  facet_wrap(~position_cat, ncol=1)+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  geom_smooth()+
  xlab("") + ylab("Audience (N)")

Looking for possible biases for speakers from inside and outside PPGE.

data$ppge <- ifelse(data$origin == "IB", "inside", "outside")
table(data$gender,data$ppge)
##    
##     inside outside
##   F     76      49
##   M     79      95
ggplot(data, aes(x=ppge, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_violin(col="black") +
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar', 
               width=0.8, size=0.8, position = position_dodge(width = 0.9))+
  xlab("PPGE") + ylab("Audience (N)")

Looking for possible biases for speakers from Brazil and abroad.

data$brazilian <- ifelse(data$country == "Brasil", "yes", "no")
table(data$gender,data$brazilian)
##    
##      no yes
##   F  22 103
##   M  50 124
ggplot(data, aes(x=brazilian, y=audience_n, fill=gender)) +
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  geom_violin(col="black") +
  geom_quasirandom(dodge.width = 0.9, shape=21)+
  stat_summary(fun.y=median, aes(ymin=..y.., ymax=..y..),geom='errorbar', 
               width=0.8, size=0.8, position = position_dodge(width = 0.9))+
  xlab("Brazilian") + ylab("Audience (N)")

Modeling

Negative binomial

data$affirm_action <- ifelse(data$year<2018,"before", "after")
mg0 <- glm.nb(audience_n~ 1, data=data)
mg1 <- glm.nb(audience_n~ gender, data=data)
mg2 <- glm.nb(audience_n~ position_cat, data=data)
mg3 <- glm.nb(audience_n~ year, data=data)
mg3b <- glm.nb(audience_n~ affirm_action, data=data)

mg4 <- glm.nb(audience_n~ gender + position_cat, data=data)
mg5 <- glm.nb(audience_n~ gender + year, data=data)
mg5b <- glm.nb(audience_n~ gender + affirm_action, data=data)
mg6 <- glm.nb(audience_n~ year + position_cat, data=data)
mg6b <- glm.nb(audience_n~ affirm_action + position_cat, data=data)

mg7 <- glm.nb(audience_n~ gender*position_cat, data=data)
mg8 <- glm.nb(audience_n~ gender*year, data=data)
mg8b <- glm.nb(audience_n~ gender*affirm_action, data=data)
mg9 <- glm.nb(audience_n~ year*position_cat, data=data)
mg9b <- glm.nb(audience_n~ affirm_action*position_cat, data=data)

mg10 <- glm.nb(audience_n~ gender + position_cat + year, data=data)
mg10b <- glm.nb(audience_n~ gender + position_cat + affirm_action, data=data)
mg11 <- glm.nb(audience_n~ gender*position_cat + year, data=data)
mg11b <- glm.nb(audience_n~ gender*position_cat + affirm_action, data=data)
mg12 <- glm.nb(audience_n~ gender + position_cat * year, data=data)
mg12b <- glm.nb(audience_n~ gender + position_cat * affirm_action, data=data)
mg13 <- glm.nb(audience_n~ gender*year + position_cat, data=data)
mg13b <- glm.nb(audience_n~ gender*affirm_action + position_cat, data=data)


mg14 <- glm.nb(audience_n~ gender*position_cat*year, data=data)
mg14b <- glm.nb(audience_n~ gender*position_cat*affirm_action, data=data)

AICtab(mg2, mg0,mg1, mg3, mg4,mg5,mg6,mg7,mg8,mg9,mg10,mg11,mg12,mg13,mg14,
      mg3b,mg5b,mg6b,mg8b,mg9b,mg10b,mg11b,mg12b,mg13b, mg14b)
##       dAIC df
## mg11b  0.0 8 
## mg10b  1.2 6 
## mg13b  3.2 7 
## mg12b  4.4 8 
## mg11   5.7 8 
## mg10   6.1 6 
## mg6b   6.8 5 
## mg14b  7.6 13
## mg7    7.7 7 
## mg13   8.1 7 
## mg4    8.2 5 
## mg12   9.8 8 
## mg9b  10.5 7 
## mg6   10.9 5 
## mg2   13.0 4 
## mg14  14.0 13
## mg9   14.2 7 
## mg5b  34.1 4 
## mg8b  35.8 5 
## mg5   37.0 4 
## mg1   38.1 3 
## mg8   39.0 5 
## mg3b  51.1 3 
## mg3   52.6 3 
## mg0   53.4 2

Residual diagnostic

hnp::hnp(mg11b)
## Negative binomial model (using MASS package)

plot(simulateResiduals(mg11b))

hnp::hnp(mg10b)
## Negative binomial model (using MASS package)

plot(simulateResiduals(mg10b))

Models result

summary(mg11b)
## 
## Call:
## glm.nb(formula = audience_n ~ gender * position_cat + affirm_action, 
##     data = data, init.theta = 6.652451909, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8934  -0.7775  -0.1330   0.4896   3.8037  
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                    3.00465    0.06992  42.970  < 2e-16 ***
## genderM                        0.11625    0.07453   1.560  0.11882    
## position_catpostdoc            0.11708    0.10696   1.095  0.27372    
## position_catprofessor          0.20059    0.10432   1.923  0.05449 .  
## affirm_actionbefore           -0.19567    0.06316  -3.098  0.00195 ** 
## genderM:position_catpostdoc   -0.12030    0.14428  -0.834  0.40440    
## genderM:position_catprofessor  0.22897    0.12861   1.780  0.07503 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.6525) family taken to be 1)
## 
##     Null deviance: 376.12  on 298  degrees of freedom
## Residual deviance: 303.02  on 292  degrees of freedom
## AIC: 2168.1
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.652 
##           Std. Err.:  0.708 
## 
##  2 x log-likelihood:  -2152.067
performance::r2(mg11b)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.303
myg11b <- ggpredict(mg11b, terms=c("position_cat","gender", "affirm_action"))
plot(myg11b) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))

summary(mg10b)
## 
## Call:
## glm.nb(formula = audience_n ~ gender + position_cat + affirm_action, 
##     data = data, init.theta = 6.502702402, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8101  -0.7831  -0.1381   0.4704   3.9188  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)            2.98211    0.06352  46.947  < 2e-16 ***
## genderM                0.15356    0.05463   2.811  0.00494 ** 
## position_catpostdoc    0.04418    0.07250   0.609  0.54229    
## position_catprofessor  0.36571    0.06046   6.049 1.46e-09 ***
## affirm_actionbefore   -0.18918    0.06316  -2.995  0.00274 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(6.5027) family taken to be 1)
## 
##     Null deviance: 369.75  on 298  degrees of freedom
## Residual deviance: 303.07  on 294  degrees of freedom
## AIC: 2169.3
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  6.503 
##           Std. Err.:  0.688 
## 
##  2 x log-likelihood:  -2157.260
performance::r2(mg10b)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.282
myg10b <- ggpredict(mg10b, terms=c("position_cat","gender", "affirm_action"))
plot(myg10b) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))

Only professors - productivity metrics

Investigating if differences in productivity between male and female professors and researches are related to the audience.

Measured productivity publication metrics from Google Scholar for professors and researchers.

Creating productivity index using PCA 1st axis from metrics.

PCA productivity metrics

dp <- data %>% filter(!is.na(data$total_citation_n),
                      !is.na(data$nature_index_count))
table(dp$gender)
## 
##  F  M 
## 20 67

Productivity publication metrics

pca1 <- PCA(dp[, c(21:28)], graph=F)
p1 <- fviz_pca_biplot(pca1, col.ind = dp$gender, addEllipses=TRUE,
                      col.ind.sub="none",  geom="point",
                      repel = TRUE) +
  scale_color_manual(name="gender",values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(name="gender",values = c("#b2abd2", "#fdb863"))
p1

Extracting PCA 2 first axes

dp$pc1 <- pca1$ind$coord[,1]
dp$pc2 <- pca1$ind$coord[,2]

Modeling

m0 <- glm.nb(audience_n ~ 1, data=dp)
m1 <- glm.nb(audience_n ~ gender, data=dp)
m2 <- glm.nb(audience_n ~ pc1 + pc2, data=dp)

m3 <- glm.nb(audience_n ~ gender + pc1 + pc2, data=dp)
m4 <- glm.nb(audience_n ~ gender*pc1 + gender*pc2, data=dp)

AICtab(m0,m1,m2,m3,m4, base=T, weights=T)
##    AIC   dAIC  df weight
## m3 693.4   0.0 5  0.550 
## m2 695.5   2.2 4  0.186 
## m4 695.7   2.4 7  0.168 
## m1 697.2   3.8 3  0.082 
## m0 700.7   7.4 2  0.014

Residual diagnostic

Best model

hnp(m3)
## Negative binomial model (using MASS package)

plot(simulateResiduals(m3))

## Model results

summary(m3)
## 
## Call:
## glm.nb(formula = audience_n ~ gender + pc1 + pc2, data = dp, 
##     init.theta = 5.219733971, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5233  -0.7225  -0.2033   0.4610   3.3874  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  3.11594    0.11007  28.310   <2e-16 ***
## genderM      0.26225    0.12520   2.095   0.0362 *  
## pc1          0.07478    0.02511   2.978   0.0029 ** 
## pc2         -0.01698    0.03941  -0.431   0.6665    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(5.2197) family taken to be 1)
## 
##     Null deviance: 103.589  on 86  degrees of freedom
## Residual deviance:  89.206  on 83  degrees of freedom
## AIC: 693.36
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  5.220 
##           Std. Err.:  0.927 
## 
##  2 x log-likelihood:  -683.359
performance::r2(mg3)
## # R2 for Generalized Linear Regression
##   Nagelkerke's R2: 0.015
my3 <- ggpredict(m3, terms=c("pc1","gender"))
plot(my3) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
   theme_cowplot() + ggtitle("") +
  ylab("Audience (N)") + xlab("Productivity (PC1 axis)")

plot(my3) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
   theme_cowplot() + ggtitle("") +
  ylab("Audience (N)") + xlab("Productivity (PC1 axis)")

my3 <- ggpredict(m3, terms=c("pc2","gender"))
plot(my3) +
  scale_color_manual(values = c("#b2abd2", "#fdb863"))+
  scale_fill_manual(values = c("#b2abd2", "#fdb863"))+
   theme_cowplot()